-
Notifications
You must be signed in to change notification settings - Fork 3
Add CLI normalizer #503
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Draft
gaurav
wants to merge
181
commits into
master
Choose a base branch
from
add-cli-normalizer
base: master
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Draft
Add CLI normalizer #503
Conversation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git c/src/babel_utils.py i/src/babel_utils.py index a96120d..5cbab9c 100644 --- c/src/babel_utils.py +++ i/src/babel_utils.py @@ -5,13 +5,15 @@ from enum import Enum from ftplib import FTP from io import BytesIO import gzip -from datetime import datetime as dt +from datetime import datetime as dt, datetime from datetime import timedelta import time import requests import os import urllib import jsonlines +import yaml + from src.node import NodeFactory, SynonymFactory, DescriptionFactory, InformationContentFactory, TaxonFactory from src.util import Text, get_config from src.LabeledID import LabeledID @@ -349,10 +351,11 @@ def get_numerical_curie_suffix(curie): return None -def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],icrdf_filename=None): +def write_compendium(metadata_yamls, synonym_list,ofname,node_type,labels={},extra_prefixes=[],icrdf_filename=None): """ + :param metadata_yaml: The YAML files containing the metadata for this compendium. :param synonym_list: - :param ofname: + :param ofname: Output filename. A file with this filename will be created in both the `compendia` and `synonyms` output directories. :param node_type: :param labels: A map of identifiers Not needed if each identifier will have a label in the correct directory (i.e. downloads/PMID/labels for PMID:xxx). @@ -371,6 +374,32 @@ def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],i node_factory = NodeFactory(make_local_name(''),biolink_version) synonym_factory = SynonymFactory(make_local_name('')) + # Write out the metadata.yaml file combining information from all the metadata.yaml files. 
+ metadata_dir = os.path.join(cdir,'metadata') + os.makedirs(metadata_dir, exist_ok=True) + with open(os.path.join(cdir, ofname + '.yaml'), 'w') as outf: + metadata = { + 'type': 'compendium', + 'name': ofname, + 'created_at': datetime.now().isoformat(), + 'concords': {} + } + for metadata_yaml in metadata_yamls: + metadata_block = yaml.safe_load(metadata_yaml) + if metadata_block is None: + raise ValueError("Metadata file {metadata_yaml} is empty.") + + metadata_name = metadata_block['name'] + + if metadata_name in metadata['concords']: + logging.error(f"Duplicate metadata block name {metadata_name}!") + logging.error(f"New metadata block from {metadata_yaml}: {metadata_block}!") + logging.error(f"Existing metadata block: {metadata['concords'][metadata_name]}!") + raise ValueError(f"Metadata file {metadata_yaml} is named {metadata_name}, but this has already been loaded.") + metadata['concords'][metadata_name] = metadata_block + + outf.write(yaml.dump(metadata)) + # Load the preferred_name_boost_prefixes -- this tells us which prefixes to boost when # coming up with a preferred label for a particular Biolink class. preferred_name_boost_prefixes = config['preferred_name_boost_prefixes']
diff --git c/src/babel_utils.py i/src/babel_utils.py index f973337..59a5360 100644 --- c/src/babel_utils.py +++ i/src/babel_utils.py @@ -14,6 +14,7 @@ import urllib import jsonlines import yaml +from src.metadata.provenance import write_combined_metadata from src.node import NodeFactory, SynonymFactory, DescriptionFactory, InformationContentFactory, TaxonFactory from src.util import Text, get_config from src.LabeledID import LabeledID @@ -559,44 +560,17 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels={}, exit() # Write out the metadata.yaml file combining information from all the metadata.yaml files. - metadata_dir = os.path.join(cdir,'metadata') - os.makedirs(metadata_dir, exist_ok=True) - with open(os.path.join(cdir, 'metadata', ofname + '.yaml'), 'w') as outf: - # TODO: move into metadata/provenance.py - metadata = { - 'type': 'compendium', - 'name': ofname, - 'created_at': datetime.now().isoformat(), - 'counts': { - 'cliques': count_cliques, - 'eq_ids': count_eq_ids, - 'synonyms': count_synonyms, - }, - 'concords': {} - } - for metadata_yaml in metadata_yamls: - with open(metadata_yaml, 'r') as metaf: - metadata_block = yaml.safe_load(metaf) - if metadata_block is None or metadata_block == {}: - raise ValueError("Metadata file {metadata_yaml} is empty.") - - if 'name' not in metadata_block: - raise ValueError(f"Metadata file {metadata_yaml} is missing a 'name' field: {metadata_block}") - - metadata_name = metadata_block['name'] - - if type(metadata_name) != str: - raise ValueError(f"Metadata file {metadata_yaml} has a 'name' field that is not a string: {metadata_block}") - - if metadata_name in metadata['concords']: - # If it's not already a list, then make it into a list. 
- if type(metadata['concords'][metadata_name]) != list: - metadata['concords'][metadata_name] = [metadata['concords'][metadata_name]] - metadata['concords'][metadata_name].append(metadata_block) - else: - metadata['concords'][metadata_name] = metadata_block - - yaml.dump(metadata, outf) + write_combined_metadata( + os.path.join(cdir, 'metadata', ofname + '.yaml'), + typ='compendium', + name=ofname, + counts={ + 'cliques': count_cliques, + 'eq_ids': count_eq_ids, + 'synonyms': count_synonyms, + }, + combined_from_filenames=metadata_yamls, + ) def glom(conc_set, newgroups, unique_prefixes=['INCHIKEY'],pref='HP',close={}): """We want to construct sets containing equivalent identifiers. diff --git c/src/createcompendia/drugchemical.py i/src/createcompendia/drugchemical.py index 2de4804..8dee460 100644 --- c/src/createcompendia/drugchemical.py +++ i/src/createcompendia/drugchemical.py @@ -1,5 +1,6 @@ import csv +from src.metadata.provenance import write_combined_metadata, write_concord_metadata from src.node import NodeFactory, InformationContentFactory from src.prefixes import RXCUI, PUBCHEMCOMPOUND, UMLS from src.categories import (CHEMICAL_ENTITY, DRUG, MOLECULAR_MIXTURE, FOOD, COMPLEX_MOLECULAR_MIXTURE, @@ -139,7 +140,7 @@ def get_cui(x,indicator_column,cui_column,aui_column,aui_to_cui,sdui_to_cui): print(x) exit() -def build_rxnorm_relationships(conso, relfile, outfile): +def build_rxnorm_relationships(conso, relfile, outfile, metadata_yaml): """RXNREL is a lousy file. The subject and object can sometimes be a CUI and sometimes an AUI and you have to use CONSO to figure out how to go back and forth. @@ -167,8 +168,32 @@ def build_rxnorm_relationships(conso, relfile, outfile): #This is maybe relying on convention a bit too much. 
if outfile == "UMLS": prefix = UMLS + sources = [ + { + 'type': 'UMLS', + 'name': 'MRCONSO', + 'filename': conso + }, + { + 'type': 'UMLS', + 'name': 'MRREL', + 'filename': relfile + } + ] else: prefix = RXCUI + sources = [ + { + 'type': 'RXNORM', + 'name': 'RXNCONSO', + 'filename': conso + }, + { + 'type': 'RXNORM', + 'name': 'RXNREL', + 'filename': relfile + } + ] aui_to_cui, sdui_to_cui = get_aui_to_cui(conso) # relfile = os.path.join('input_data', 'private', "RXNREL.RRF") single_use_relations = {"has_active_ingredient": defaultdict(set), @@ -214,6 +239,13 @@ def build_rxnorm_relationships(conso, relfile, outfile): continue outf.write(f"{prefix}:{subject}\t{predicate}\t{prefix}:{next(iter(objects))}\n") + write_concord_metadata( + metadata_yaml, + name='build_rxnorm_relationships()', + description=f'Builds relationships between RxCUI and other identifiers from a CONSO ({conso}) and a REL ({relfile}).', + sources=sources + ) + def load_cliques(compendium): rx_to_clique = {} @@ -228,7 +260,7 @@ def load_cliques(compendium): rx_to_clique[terms["i"]] = clique return rx_to_clique -def build_pubchem_relationships(infile,outfile): +def build_pubchem_relationships(infile,outfile, metadata_yaml): with open(infile,"r") as inf: document = json.load(inf) with open(outfile,"w") as outf: @@ -238,7 +270,19 @@ def build_pubchem_relationships(infile,outfile): for cid in cids: outf.write(f"{RXCUI}:{rxnid}\tlinked\t{PUBCHEMCOMPOUND}:{cid}\n") -def build_conflation(manual_concord_filename, rxn_concord, umls_concord, pubchem_rxn_concord, drug_compendium, chemical_compendia, icrdf_filename, outfilename): + write_concord_metadata( + metadata_yaml, + name='build_pubchem_relationships()', + description=f'Builds relationships between RxCUI and PubChem Compound identifiers from a PubChem annotations file ({infile}).', + sources=[{ + 'type': 'PubChem', + 'name': 'PubChem RxNorm annotations', + 'description': 'PubChem RxNorm mappings generated by pubchem.pull_rxnorm_annotations()', + 
'filename': infile + }] + ) + +def build_conflation(manual_concord_filename, rxn_concord, umls_concord, pubchem_rxn_concord, drug_compendium, chemical_compendia, icrdf_filename, outfilename, input_metadata_yamls, output_metadata_yaml): """RXN_concord contains relationshps between rxcuis that can be used to conflate Now we don't want all of them. We want the ones that are between drugs and chemicals, and the ones between drugs and drugs. @@ -556,6 +600,15 @@ def build_conflation(manual_concord_filename, rxn_concord, umls_concord, pubchem outfile.write(f"{json.dumps(final_conflation_id_list)}\n") written.add(fs) + # Write out metadata.yaml + write_combined_metadata( + output_metadata_yaml, + typ='conflation', + name='drugchemical.build_conflation()', + description='Build DrugChemical conflation.', + combined_from_filenames=input_metadata_yamls + ) + def sort_by_curie_suffix(curie): """ diff --git c/src/metadata/provenance.py i/src/metadata/provenance.py index 54bc50e..5a8f703 100644 --- c/src/metadata/provenance.py +++ i/src/metadata/provenance.py @@ -1,3 +1,4 @@ +import os.path from datetime import datetime import yaml @@ -8,13 +9,56 @@ def write_download_metadata(filename, name, url='', description='', sources=None def write_concord_metadata(filename, name, url='', description='', sources=None, counts=None): write_metadata(filename, 'concord', name, url=url, description=description, sources=sources, counts=None) -def write_metadata(filename, typ, name, sources=None, url='', description='', counts=None): - if type(name) != str: +def write_combined_metadata(filename, typ, name, sources=None, url='', description='', counts=None, combined_from_filenames=None): + combined_from = {} + if combined_from_filenames is not None: + for metadata_yaml in combined_from_filenames: + with open(metadata_yaml, 'r') as metaf: + metadata_block = yaml.safe_load(metaf) + if metadata_block is None or metadata_block == {}: + raise ValueError(f"Metadata file {metadata_yaml} is empty.") + + 
if 'name' not in metadata_block: + raise ValueError(f"Metadata file {metadata_yaml} is missing a 'name' field: {metadata_block}") + + metadata_name = metadata_block['name'] + + if type(metadata_name) is not str: + raise ValueError(f"Metadata file {metadata_yaml} has a 'name' field that is not a string: {metadata_block}") + + if metadata_name in combined_from: + # If it's not already a list, then make it into a list. + if type(combined_from[metadata_name]) is not list: + combined_from[metadata_name] = [combined_from[metadata_name]] + combined_from[metadata_name].append(metadata_block) + else: + combined_from[metadata_name] = metadata_block + + write_metadata( + filename, + typ=typ, + name=name, + sources=sources, + url=url, + description=description, + counts=counts, + combined_from=combined_from + ) + +def write_metadata(filename, typ, name, sources=None, url='', description='', counts=None, combined_from=None): + if type(typ) is not str: + raise ValueError(f"Metadata entry type must be a string, not {type(typ)}: '{typ}'") + if type(name) is not str: raise ValueError(f"Metadata entry name must be a string, not {type(name)}: '{name}'") if sources is None: sources = [] if counts is None: counts = [] + if combined_from is None: + combined_from = [] + + metadata_dir = os.path.dirname(filename) + os.makedirs(metadata_dir, exist_ok=True) with open(filename, 'w') as fout: yaml.dump({ 'created_at': datetime.now().isoformat(), @@ -24,4 +68,5 @@ def write_metadata(filename, typ, name, sources=None, url='', description='', co 'description': description, 'sources': sources, 'counts': counts, + 'combined_from': combined_from, }, fout) diff --git c/src/snakefiles/drugchemical.snakefile i/src/snakefiles/drugchemical.snakefile index 9640c13..3f6a8d3 100644 --- c/src/snakefiles/drugchemical.snakefile +++ i/src/snakefiles/drugchemical.snakefile @@ -1,6 +1,7 @@ import src.createcompendia.drugchemical as drugchemical import src.synonyms.synonymconflation as synonymconflation import 
src.snakefiles.util as util +from src.metadata.provenance import write_concord_metadata ### Drug / Chemical @@ -9,39 +10,56 @@ rule rxnorm_relationships: rxnconso = config['download_directory'] + "/RxNorm/RXNCONSO.RRF", rxnrel = config['download_directory'] + "/RxNorm/RXNREL.RRF", output: - outfile_concords = config['intermediate_directory'] + '/drugchemical/concords/RXNORM' + outfile_concords = config['intermediate_directory'] + '/drugchemical/concords/RXNORM', + metadata_yaml = config['intermediate_directory'] + '/drugchemical/concords/metadata-RXNORM.yaml' run: - drugchemical.build_rxnorm_relationships(input.rxnconso, input.rxnrel, output.outfile_concords) + drugchemical.build_rxnorm_relationships(input.rxnconso, input.rxnrel, output.outfile_concords, output.metadata_yaml) rule umls_relationships: input: umlsconso = config['download_directory'] + "/UMLS/MRCONSO.RRF", umlsrel = config['download_directory'] + "/UMLS/MRREL.RRF", output: - outfile_concords = config['intermediate_directory'] + '/drugchemical/concords/UMLS' + outfile_concords = config['intermediate_directory'] + '/drugchemical/concords/UMLS', + metadata_yaml = config['intermediate_directory'] + '/drugchemical/concords/metadata-UMLS.yaml' run: - drugchemical.build_rxnorm_relationships(input.umlsconso, input.umlsrel, output.outfile_concords) + drugchemical.build_rxnorm_relationships(input.umlsconso, input.umlsrel, output.outfile_concords, output.metadata_yaml) rule pubchem_rxnorm_relationships: input: infile = config['download_directory'] + '/PUBCHEM.COMPOUND/RXNORM.json', output: - outfile_concords = config['intermediate_directory'] + '/drugchemical/concords/PUBCHEM_RXNORM' + outfile_concords = config['intermediate_directory'] + '/drugchemical/concords/PUBCHEM_RXNORM', + metadata_yaml = config['intermediate_directory'] + '/drugchemical/concords/metadata-PUBCHEM_RXNORM.yaml' run: - drugchemical.build_pubchem_relationships(input.infile,output.outfile_concords) + 
drugchemical.build_pubchem_relationships(input.infile,output.outfile_concords, output.metadata_yaml) rule drugchemical_conflation: input: drug_compendium=config['output_directory']+'/compendia/'+'Drug.txt', chemical_compendia=expand("{do}/compendia/{co}", do=config['output_directory'], co=config['chemical_outputs']), rxnorm_concord=config['intermediate_directory']+'/drugchemical/concords/RXNORM', + rxnorm_metadata=config['intermediate_directory']+'/drugchemical/concords/metadata-RXNORM.yaml', umls_concord=config['intermediate_directory']+'/drugchemical/concords/UMLS', + umls_metadata=config['intermediate_directory']+'/drugchemical/concords/metadata-UMLS.yaml', pubchem_concord=config['intermediate_directory']+'/drugchemical/concords/PUBCHEM_RXNORM', + pubchem_metadata=config['intermediate_directory']+'/drugchemical/concords/metadata-PUBCHEM_RXNORM.yaml', drugchemical_manual_concord=config['input_directory']+'/manual_concords/drugchemical.tsv', icrdf_filename=config['download_directory']+'/icRDF.tsv', output: - outfile=config['output_directory']+'/conflation/DrugChemical.txt' + outfile=config['output_directory']+'/conflation/DrugChemical.txt', + metadata_yaml=config['output_directory']+'/conflation/metadata.yaml', + drugchemical_manual_metadata=config['intermediate_directory']+'/drugchemical/concords/metadata-Manual.yaml', run: + write_concord_metadata(input.drugchemical_manual_metadata, + name='Manual DrugChemical Concords', + description='Manually curated DrugChemical conflation cross-references from the Babel repository', + sources=[{ + 'name': 'Babel repository', + 'url': 'https://github.com/TranslatorSRI/Babel', + }], + url='https://github.com/TranslatorSRI/Babel/blob/master/input_data/manual_concords/drugchemical.tsv', + ) drugchemical.build_conflation( input.drugchemical_manual_concord, input.rxnorm_concord, @@ -50,7 +68,13 @@ rule drugchemical_conflation: input.drug_compendium, input.chemical_compendia, input.icrdf_filename, - output.outfile) + 
output.outfile, + input_metadata_yamls={ + 'RXNORM': input.rxnorm_metadata, + 'UMLS': input.umls_metadata, + 'PUBCHEM_RXNORM': input.pubchem_metadata, + 'Manual': input.drugchemical_manual_metadata, + }, output_metadata_yaml=output.metadata_yaml) rule drugchemical_conflated_synonyms: input:
Since MeSH is not an ids file for proteins, this should only pull in MeSH IDs that are associated with a UMLS ID.
Could also be useful to track memory in the future.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
We have several requests/needs for files containing CURIEs to be normalized in bulk. There are multiple ways of doing this using NodeNorm, but it would be nice to have something that can do it as an INNER JOIN against the combined DuckDB database we create while building NodeNorm, as that should be way faster than other approaches. We could also use this to export every mapping we have from a particular source (i.e. https://github.com/TranslatorSRI/NodeNormalization/issues/321).
WIP. Should be merged after PR #495.